Downloded the data :
options(scipen = 999)
flights <- as.data.frame(nycflights13::flights)
wheather <- nycflights13::weather
airports <- nycflights13::airports
planes <- nycflights13::planes
The first graph shows us the percentage of flights that departed over a 15-minute delay divided into each destination individually. The purpose of this graph is to make the reader’s eye quickly find the percentage of flights based on a visualization of the United States map.
The second graph shows the weekly flight cycles throughout 2008 where the blue line indicates the total number of flights that took off that day and the red line indicates the number of flights in which there was a delay of over 15 minutes.
In the first graph the information is not presented in an ideal way, it is difficult for the reader to identify where each line is coming from, although it is possible to understand which countries it is but it is not always possible to identify exactly which airport it is.
In the second graph, in our opinion, the amount of lines and points confuses the reader and makes it difficult for him to identify the patterns of flight delays throughout the seasons. In addition, the information overload makes it difficult to identify recurring latency patterns on certain days for the length of the weekly / monthly cycle.
The first graph raises the obvious question: Are there countries where the average latency is higher than in other countries? In addition, which are the airports where the percentage of delays is higher? Another question is whether the percentages of delay measured vary according to the seasons?
The second graph raises the questions: Are there patterns of lateness that recur on certain days of the week or in a particular season, or are there lateness in some months than in other months? In addition, if the observation was lower by day / month, it would be possible to compare percentages and learn from the data by comparing proportions.
The first graph can be improved by changing the graph to a dynamic graph. This means that when the mouse points to a certain line, we will be presented with the values of the airport we are pointing to.
In the second graph it was possible to improve the visualization by changing the red bars to a long line connecting the dots which makes it possible to identify patterns in the delays of the flights.
A graphic summarizing the flight volume and flights delayed, broken by day and showing weekly cycles.
#EWR/ LGA/ JFK to choose
week_cycles <- as.data.frame(flights) %>% filter(origin == 'EWR') %>% dplyr::select(time_hour ,dep_delay ,sched_dep_time)
week_cycles$time_hour <- as.Date(week_cycles$time_hour)
week_cycles<- week_cycles %>% mutate(delay = if_else(dep_delay <= 15 ,0,1,0))
week_cycles_freq <- week_cycles %>% group_by(time_hour) %>% summarise(frequency = n())
week_cycles_delay <- week_cycles %>% group_by(time_hour) %>% summarise(sum(delay))
l <- length(week_cycles_freq$frequency)
week_cycles_min <- week_cycles_freq[c(2:(l-1)),]
week_cycles_min <- week_cycles_min %>% mutate(local_min = (week_cycles_min$frequency == runmin(week_cycles_min$frequency,length(week_cycles_min$frequency)/2)))
week_cycles_min <- week_cycles_min %>% filter(local_min == T) %>% filter(frequency < 250)
ggplot(data = week_cycles_freq,aes(x = time_hour,
y= week_cycles_freq$frequency,colour = "blue")) +
geom_point() + geom_line(aes(color = "blue")) +
geom_point(data = week_cycles_delay,aes(x=time_hour,y= `sum(delay)`), color ='red') +
geom_linerange(data = week_cycles_delay,aes(x= time_hour, ymax =`sum(delay)` ,ymin=0,color = 'red')) +
geom_point(data = week_cycles_min,aes(x=time_hour,y=week_cycles_min$frequency,
color = "lightblue"),size = 7)+
scale_color_identity(name ="" ,breaks = c("blue","red","lightblue"),
labels = c("All Flights (scheculed for departure)",
"Late Flights (departure delayed >15)", "Fewer flights"),
guide = guide_legend(override.aes = list(linetype = c(1, 1, 0),
shape = c(NA,NA,16),
size = c(1,1,7)))) +
theme_light() +
xlab("Date")+ ylab("Flights per day") +
theme(legend.position="top",legend.direction = "vertical") +
ggtitle(label = "Weekly Cycles", subtitle = "the airport, we need to choose, Year = 2013") +
theme(plot.title = element_text(hjust = 0.5,size = 20),plot.subtitle = element_text(hjust = 0.5,size = 15))
The other graph, flight percent
dep_delay <- as.data.frame(flights) %>% filter(origin == 'EWR') %>%
mutate(delay = if_else(dep_delay <= 15 ,0,1,0)) %>% group_by(faa = dest) %>% summarise(amount = (frequency = n()), dep = sum(delay)) %>% mutate(per = dep/amount)
dep_delay_loc <- dep_delay %>% left_join(airports, by = "faa") %>% drop_na()
# removed : STT,SJU, BQN need to be explain why they can be found.
dep_delay_loc <- dep_delay_loc %>% mutate(per_ch = NA)
dep_delay_loc$per_ch[dep_delay_loc$per <= 0.10 ] <- "<= 10%"
dep_delay_loc$per_ch[dep_delay_loc$per <= 0.15 & dep_delay_loc$per > 0.10 ] <- "10% - 15%"
dep_delay_loc$per_ch[dep_delay_loc$per <= 0.20 & dep_delay_loc$per > 0.15 ] <- "15% - 20%"
dep_delay_loc$per_ch[dep_delay_loc$per <= 0.25 & dep_delay_loc$per > 0.20 ] <- "20% - 25%"
dep_delay_loc$per_ch[dep_delay_loc$per >= 0.25] <- "25% <="
dep_delay_loc$per_ch <- factor(dep_delay_loc$per_ch, levels = c("<= 10%","10% - 15%",
"15% - 20%","20% - 25%","25% <="))
dep_delay_loc <- dep_delay_loc %>% mutate(nyc_EWR_lon = -74.16867) %>% mutate(nyc_EWR_lat = 40.6925 )
dep_delay_trans <- dep_delay_loc %>% select(lon,lat) %>% usmap_transform()
dep_delay_loc$lon <- dep_delay_trans$lon.1
dep_delay_loc$lat <- dep_delay_trans$lat.1
dep_delay_trans <- dep_delay_loc %>% select(nyc_EWR_lon,nyc_EWR_lat) %>% usmap_transform()
dep_delay_loc$lnyc_EWR_lon <- dep_delay_trans$nyc_EWR_lon.1
dep_delay_loc$lnyc_EWR_lat <- dep_delay_trans$nyc_EWR_lat.1
row.names(dep_delay_loc) <- dep_delay_loc$name
plot_del <- plot_usmap(regions = "states",labels = TRUE,exclude = c("AK","HI"), size = 0.5,
label_color = "grey",
color = "grey") +
theme(panel.background=element_blank()) +
geom_segment(data = dep_delay_loc, aes(xend = lon,yend = lat, x = lnyc_EWR_lon,
y = lnyc_EWR_lat ,colour = per_ch),size = 0.5) +
geom_point(data = dep_delay_loc, aes(x = lon , y = lat, colour = per_ch, text = row.names(dep_delay_loc)),
size = 0.5, show.legend = F) +
scale_color_manual(name = "",values = c("green","purple","blue","orange","red"),
guide = guide_legend(override.aes = list(linetype = c(1, 1, 1,1,1),
size = c(1.5,1.5,1.5,1.5,1.5))))+
labs(title = '<b>% of Flight Departures Delayed > 15 Min</b><br> Airport = EWR Year = 2013', caption = "Click line endpoint to see that airport departures") +
theme(plot.title = element_text(hjust = 0.5,size = 13),
plot.subtitle = element_text(hjust = 0.5,size = 10))
ggplotly(plot_del, tooltip = "text") %>% layout(legend = list(x = 0, y = 0))
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.
The first graph shows the relationship between visibility and flight delays.
flights_delayed <- filter(flights, dep_delay >15 ) #We want to see only the delayed flights
flights_delayed1 <- flights_delayed %>% group_by(time_hour) %>% summarise(Avg_dep_delay = mean(dep_delay))
Wheather_cond <- left_join(flights_delayed1, wheather) #combine the two relevant data frames
visib_df <- Wheather_cond %>% group_by(visib) %>% summarise(Avg_dep_delay = mean(Avg_dep_delay))
ggplot(data = visib_df, mapping = aes(x = visib, y = Avg_dep_delay)) + geom_point(color = "violetred4") +
geom_smooth(method = "lm", fill = "violet", color = "violetred4") +
labs(x = "visibility", y="Average Departure Delay Time (minutes)",
title = "Average Departure Delay vs. Quality of visibility")
When we tried to think what was the main reason for the delay of flights the first thing that came to our mind was the weather conditions. When we studied the weather database we noticed a variable called “visib” and decided to try to understand the relationship between visibility and flight delays. The graph shown above shows a direct relationship between the quality of visibility and the average flight delay in minutes. The better the visibility, the smaller the average delays.
The second graph
לא עבד נמשיך מחר :(( ))
flights_delayed2 <- filter(flights, arr_delay >0 )
merges_planes <- merge(flights, planes, by = "tailnum")
merges_planes <- mutate(merges_planes, Age = year.x - year.y) %>% mutate(Age = if_else(Age > 25, 25L, Age))
flights_delayed_plane <- merges_planes %>% group_by(Age) %>% summarise(Avg_dep_delay = mean(arr_delay, na.rm = T), n = n_distinct(arr_delay, na.rm = T)) %>% filter(!is.na(Age))
flights_delayed_plane <- filter(flights_delayed_plane, n >30)
# ggplot(data = flights_delayed_plane, mapping = aes(x = Age, y = Avg_dep_delay)) + geom_point(color = "violetred4") + geom_smooth( fill = "violet", color = "violetred4")
#
#
# ggplot(data = merges_planes) +
# geom_boxplot(mapping = aes(x = manufacturer, y = dep_delay)) + coord_flip()
#Creat a plot that shows the distribution of the delayed flights in each day of every month
ggplot(data = flights_delayed, aes(x = day)) +
geom_bar(aes(fill = month))+ facet_wrap(~month) +
labs(title = "Distribution of delayed flight in each month",
subtitle = "(Delayed flights are set to be flights that left the airport with over 15 minutes delay)", y = "Count of delayed flight", x = "Day") + scale_fill_viridis_c(guide = F)
#First' we created a df that containes only the data that we need
by_month <- flights_delayed %>% group_by(month) %>% summarise(val = n_distinct(dep_delay))
by_month <- data.frame(by_month, semp = c(rep(1,12))) #named the real data as sample 1.
sampled_df <- by_month
row.names(sampled_df) <- by_month$month
#loop that sample new vecrots and tag the sample with number
#also each sample will merged to the data that containes the real values.
for (i in (2:12)){
x <- data.frame(month = c(1:12), semp = c(rep(i,12)),val = sample(by_month$val))
sampled_df <- full_join(sampled_df, x)
}
#plot all of the samples in the Line-up method
# we added colors that will help us identify the pattern of the values.
ggplot(data = sampled_df, mapping = aes(x = month, y = val, fill = val)) + facet_wrap(~semp) +
scale_x_continuous(breaks = c(1:12)) +
geom_bar(stat="identity") +
labs(title = "Simulated data-sets of delayed flight using line-up",
subtitle = "(Delayed flights are set to be flights that left the airport with over 15 minutes delay)", y = "Count of delayed flight", x = "Month")+
theme(axis.text.x = element_text(size = 6)) +
scale_fill_distiller(palette = 'RdPu', direction =1)
BLA BLA BLA